# Date window (as character strings) used by the week-difference
# benchmarks below; f1() passes them uncast, f2() casts with as.Date().
start_date <- "2017-01-01"
end_date <- "2019-12-31"
f1 <- function(d2, d1) {
  # Whole weeks between d1 and d2 WITHOUT an explicit as.Date() cast:
  # difftime() coerces character input internally (via as.POSIXct),
  # which is the slower path this benchmark measures.
  # Previously the result was stored in an unused local (`n_weeks <-`),
  # returning it only invisibly; return it directly instead.
  floor(as.numeric(difftime(d2, d1, units = "weeks")))
}
f2 <- function(d2, d1) {
  # Whole weeks between d1 and d2 WITH an explicit as.Date() cast
  # (the faster path: Date arithmetic avoids POSIXct coercion).
  # The pointless `n_weeks <-` local assignment is dropped so the
  # value is returned visibly.
  floor(as.numeric(difftime(as.Date(d2), as.Date(d1), units = "weeks")))
}
# Benchmark: character input to difftime() (Nocast) vs explicit
# as.Date() cast (Cast), 1000 reps. Captured output shows the cast
# version is ~3x faster.
m1<-microbenchmark(
Nocast = f1(end_date, start_date),
Cast = f2(end_date, start_date),
times = 1000
)
print(m1)
## Unit: microseconds
## expr min lq mean median uq max neval
## Nocast 312.824 318.2595 336.2194 321.1050 329.5550 3687.196 1000
## Cast 112.270 115.7360 122.6819 116.9485 119.3275 2732.774 1000
fbox_plot(m1, "microseconds")
no_size <- function(n) {
  # Intentionally inefficient demo: grows `x` by concatenation on every
  # iteration, copying the whole vector each time (O(n^2) total work).
  # Keep the growth pattern — it is what the benchmark measures.
  x <- c()
  for (i in seq(n)) {
    x <- c(x, i)
  }
  # Previously the `for` loop was the last expression, so the function
  # returned NULL and the built vector was discarded; return it.
  x
}
explicit_size <- function(n) {
  # Fast counterpart to no_size(): preallocate the full integer vector
  # once, then fill it in place (no copies while looping).
  x <- vector("integer", n)
  for (i in seq(n)) {
    x[i] <- i
  }
  # Previously returned NULL (the for-loop's value); return the vector.
  x
}
# Benchmark: vector growth vs preallocation on 1e4 elements, 10 reps.
# Preallocation is ~100x faster in the captured run.
m3 <- microbenchmark(
no_size = no_size(1e4),
explicit_size = explicit_size(1e4),
times = 10
)
print(m3)
## Unit: microseconds
## expr min lq mean median uq max
## no_size 66875.765 67156.499 69265.9829 68431.696 69482.844 78992.936
## explicit_size 327.823 331.379 618.9222 348.501 368.569 3066.097
## neval
## 10
## 10
fbox_plot(m3, "microseconds")
# 1e8 uniform draws for the which()/sum() comparison below.
# NOTE(review): this shadows base::vector() (used by explicit_size above,
# which resolves it before this line runs) — a different name would be safer.
vector <- runif(1e8)
w1 <- function(x) {
  # Count elements above 0.5 via which(): materialises the full index
  # vector before taking its length (the slower approach benchmarked).
  # The unused local `d <-` assignment is dropped so the count is
  # returned visibly.
  length(which(x > .5))
}
w2 <- function(x) {
  # Count elements above 0.5 by summing the logical mask directly —
  # no intermediate index vector. Unused `d <-` assignment removed.
  sum(x > .5)
}
# Benchmark: length(which(...)) vs sum(...) over the 1e8-element vector.
m4 <- microbenchmark(
which = w1(vector),
nowhich = w2(vector),
times = 10
)
print(m4)
## Unit: milliseconds
## expr min lq mean median uq max neval
## which 625.5163 626.5470 657.6817 629.4245 633.2491 820.3012 10
## nowhich 218.1222 223.0192 233.1465 223.8137 224.2000 312.0730 10
# NOTE(review): "miliseconds" is misspelled here and below — confirm
# fbox_plot() only uses the string as a label before correcting it.
fbox_plot(m4, "miliseconds")
# Small two-column table for the row- vs column-subsetting comparison.
n <- 1e4
dt <- data.table(
a = seq(n), b = runif(n)
)
v1 <- function(dt) {
  # Row-first subsetting: extract whole rows where b > .5, then take
  # column a of the reduced table (the slower path benchmarked).
  # Unused `d <-` assignment removed so the mean is returned visibly.
  mean(dt[dt$b > .5, ]$a)
}
v2 <- function(dt) {
  # Column-first subsetting: index column a directly with the logical
  # mask — no intermediate row subset. Unused `d <-` removed.
  mean(dt$a[dt$b > .5])
}
# Benchmark: row-wise vs column-wise subsetting of the dt built above.
m5 <- microbenchmark(
row_operation = v1(dt),
column_operation = v2(dt),
times = 10
)
print(m5)
## Unit: microseconds
## expr min lq mean median uq max neval
## row_operation 200.966 212.988 973.2879 235.1795 293.268 5437.906 10
## column_operation 77.234 85.930 310.4828 94.5820 97.922 2217.411 10
fbox_plot(m5, "microseconds")
# seq() avoids the 1:x trap: when x is 0, 1:x yields c(1, 0) instead of an empty sequence.
# Sequence length for the 1:n vs seq(n) comparison below.
num <- 1e7
s1 <- function(num) {
  # Mean over 1:num (colon operator). Unused `d <-` assignment removed
  # so the value is returned visibly.
  mean(1:num)
}
s2 <- function(num) {
  # Mean over seq(num) — same values as 1:num for positive num, but
  # seq() is safe when num is 0. Unused `d <-` assignment removed.
  mean(seq(num))
}
# Benchmark: 1:n vs seq(n) — captured output shows they are equivalent
# in speed; seq() is preferred purely for the n == 0 safety.
m6<-microbenchmark(
noseq = s1(num),
seq = s2(num),
times = 30
)
print(m6)
## Unit: milliseconds
## expr min lq mean median uq max neval
## noseq 69.75186 69.90705 70.00728 69.95253 70.01261 71.40059 30
## seq 69.75907 69.91578 70.00642 69.95210 70.01234 71.68866 30
fbox_plot(m6, "miliseconds")
# 1e6-row table for the paste0() vs glue() string-building comparison.
large_dataset <- data.table(
id = 1:1000000,
value = sample(letters, 1000000, replace = TRUE)
)
a1 <- function(x) {
  # Build a combined "id_value" code column with base paste0().
  # Unused `d <-` assignment removed so the result is returned visibly.
  x |> mutate(code = paste0(id, "_", value))
}
a2 <- function(x) {
  # Same code column built with glue() string interpolation.
  # NOTE(review): glue() returns a glue-classed character vector —
  # confirm downstream code does not depend on a plain character column.
  # Unused `d <-` assignment removed.
  x |> mutate(code = glue("{id}_{value}"))
}
# Benchmark: paste0() vs glue() on 1e6 rows — near-identical timings.
m7 <- microbenchmark(
with_paste = a1(large_dataset),
with_glue = a2(large_dataset),
times = 20
)
print(m7)
## Unit: milliseconds
## expr min lq mean median uq max neval
## with_paste 559.8918 565.9926 593.8644 567.9682 573.3562 1050.1336 20
## with_glue 589.1190 596.2724 601.1875 600.4088 605.6657 620.1408 20
fbox_plot(m7, "miliseconds")
# Create a large list
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)
# Benchmark: lapply() vs an index-assigned for loop over the list.
m8 <- microbenchmark(
lapply = lapply(big_list, mean),
for_loop = {
result <- list()
for (i in seq_along(big_list)) {
result[[i]] <- mean(big_list[[i]])
}
},
times = 10
)
print(m8)
## Unit: milliseconds
## expr min lq mean median uq max neval
## lapply 309.1089 310.4311 346.7663 321.3544 346.2603 560.2860 10
## for_loop 337.9439 351.4920 373.4576 371.2677 392.8545 426.0713 10
fbox_plot(m8, "miliseconds")
# 100k-row table plus matching Date/IDate endpoints for the
# date-range filtering comparison below.
dt <- data.table(
Date = as.Date('2023-01-01') + 0:99999,
iDate = as.IDate('2023-01-01') + 0:99999,
Value = rnorm(100000)
)
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')
date_between_operation <- function(nd1, nd2) {
  # Inclusive [nd1, nd2] range filter on the global `dt` using dplyr
  # over the Date column (the slower path in this comparison).
  filter(dt, Date >= nd1 & Date <= nd2)
}
idate_between_operation <- function(id1, id2) {
  # Same inclusive range filter, but with data.table indexing and
  # data.table::between() on the integer-backed IDate column.
  dt[data.table::between(iDate, id1, id2)]
}
# Benchmark: dplyr filter on Date vs data.table between() on IDate.
m9 <- microbenchmark(
Date = date_between_operation(nd1, nd2),
iDate = idate_between_operation(id1, id2),
times = 200L
)
print(m9)
## Unit: microseconds
## expr min lq mean median uq max neval
## Date 1480.345 1540.232 1828.7724 1792.668 1989.911 3805.127 200
## iDate 575.745 604.078 726.0476 647.369 816.069 2259.550 200
# NOTE(review): the results above are in microseconds but the label
# below says "miliseconds" — confirm the intended unit for the plot.
fbox_plot(m9, "miliseconds")
switch_function <- function(x) {
  # Map a single code letter to its fruit name; any unmatched string
  # falls through to the unnamed "default" arm of switch().
  switch(
    x,
    a = "apple",
    b = "banana",
    c = "cherry",
    "default"
  )
}
case_when_function <- function(x) {
  # Vectorised dplyr::case_when equivalent of switch_function();
  # conditions are checked in order and TRUE supplies the fallback.
  # This is the construct being benchmarked — much slower per scalar
  # call than switch(), as the captured m10 output shows.
  case_when(
    x == "a" ~ "apple",
    x == "b" ~ "banana",
    x == "c" ~ "cherry",
    TRUE ~ "default"
  )
}
# Create a vector of test values
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
# Benchmark: scalar switch() vs case_when(), each applied element-wise.
# NOTE(review): sapply() here is part of the measured work; vapply()
# would be the safer idiom but would change what is being timed.
m10 <- microbenchmark(
switch = sapply(test_values, switch_function),
case_when = sapply(test_values, case_when_function),
times = 200L
)
print(m10)
## Unit: microseconds
## expr min lq mean median uq max
## switch 637.09 649.678 779.6834 656.7265 679.204 10834.11
## case_when 216994.28 228406.518 234731.6331 234985.1145 237612.367 378996.25
## neval
## 200
## 200
fbox_plot(m10, "microseconds")
# 1e6-row table for the case_when() vs fcase() categorisation below.
set.seed(123)
n <- 1e6
data <- data.table(
id = seq(n),
value = sample(seq(100), n, replace = TRUE)
)
casewhenf <- function(data) {
  # Categorise `value` with dplyr::case_when (first matching condition
  # wins, so the <= 20 / <= 70 / > 70 order is significant).
  # Unused `df <-` assignment removed so the result is returned visibly.
  data |>
    mutate(category = case_when(
      value <= 20 ~ "Low",
      value <= 70 ~ "Medium",
      value > 70 ~ "High"))
}
fcasef <- function(data) {
  # Same categorisation via data.table::fcase — the faster variant
  # benchmarked in m11. Unused `df <-` assignment removed.
  data |>
    mutate(category = fcase(
      value <= 20, "Low",
      value <= 70, "Medium",
      value > 70, "High"))
}
# Benchmark: case_when() vs fcase() on 1e6 rows — fcase ~3x faster.
m11 <- microbenchmark(
case_when = casewhenf(data),
fcase = fcasef(data),
times = 20
)
print(m11)
## Unit: milliseconds
## expr min lq mean median uq max neval
## case_when 56.62328 61.99697 65.01510 62.64978 70.75990 72.87325 20
## fcase 21.10005 21.51619 23.83171 22.87912 26.33919 27.37896 20
fbox_plot(m11, "miliseconds")
# 1e6-row table with NAs mixed into both value columns for the
# replace_na() vs fcoalesce() comparison below.
set.seed(123)
DT <- data.table(
ID = 1:1e6,
Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)
# Define the functions
replace_na_f <- function(data) {
  # Zero-fill NAs in both value columns with tidyr::replace_na.
  # Unused `DF <-` assignment removed so the result is returned visibly.
  # NOTE(review): the trailing as.data.table() is extra work the
  # fcoalesce variant does not perform — confirm it belongs in the
  # timed comparison.
  data |>
    mutate(Value1 = replace_na(Value1, 0),
           Value2 = replace_na(Value2, 0)) |>
    as.data.table()
}
fcoalesce_f <- function(data) {
  # Zero-fill NAs via data.table::fcoalesce with an integer literal
  # (0L) so no type coercion is needed on the integer columns.
  # Unused `DF <-` assignment removed.
  data |>
    mutate(Value1 = fcoalesce(Value1, 0L),
           Value2 = fcoalesce(Value2, 0L))
}
# Benchmark: replace_na() vs fcoalesce() on the NA-laden DT.
m12 <- microbenchmark(
treplace_na = replace_na_f(DT),
tfcoalesce = fcoalesce_f(DT),
times = 20
)
print(m12)
## Unit: milliseconds
## expr min lq mean median uq max neval
## treplace_na 7.329550 7.402150 8.147317 7.609567 8.725226 11.164893 20
## tfcoalesce 1.498779 1.569402 1.900937 1.738797 1.983575 4.281567 20
fbox_plot(m12, "miliseconds")
# Tiny 4-row table for the negated-grepl filtering comparison.
dt <- data.table(field_name = c("argentina.blue.man.watch",
"brazil.red.woman.shoes",
"canada.green.kid.hat",
"denmark.red.man.shirt"))
# Filter rows where 'field_name' does not contain 'red'
dtnot <- function(data) {
  # data.table filter: keep rows whose field_name does NOT match "red".
  # Unused `filtered_dt <-` assignment removed so the result is
  # returned visibly.
  data |> _[!grepl("red", field_name)]
}
dplyrnot <- function(data) {
  # dplyr equivalent of dtnot(): filter out rows matching "red".
  # Unused `filtered_dt <-` assignment removed.
  data |> filter(!grepl("red", field_name))
}
# Benchmark: data.table vs dplyr negated-grepl filtering on 4 rows
# (dominated by per-call overhead at this size).
m13 <- microbenchmark(
data_table_not = dtnot(dt),
dplyrnot = dplyrnot(dt),
times = 100
)
print(m13)
## Unit: microseconds
## expr min lq mean median uq max neval
## data_table_not 102.762 112.019 145.5976 122.188 137.908 2055.259 100
## dplyrnot 654.934 673.423 714.3984 687.980 703.454 2698.680 100
fbox_plot(m13, "microseconds")
# 100k x 4 numeric table for the wide-to-long reshape comparison.
large_data <- data.table(
id = 1:100000,
var1 = rnorm(100000),
var2 = rnorm(100000),
var3 = rnorm(100000),
var4 = rnorm(100000)
)
# Benchmarking
# tidyr::pivot_longer vs data.table::melt — melt is ~10x faster here.
m14 <- microbenchmark(
tidyr_pivot_longer = {
long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"),
names_to = "variable", values_to = "value")
},
data_table_melt = {
long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable",
value.name = "value")
},
times = 10
)
print(m14)
## Unit: microseconds
## expr min lq mean median uq max
## tidyr_pivot_longer 6022.238 6069.876 7901.087 6204.218 6327.267 23319.004
## data_table_melt 417.550 478.313 640.305 573.722 736.386 1128.397
## neval
## 10
## 10
fbox_plot(m14, "microseconds")
# Two 1000-element vectors for the cross-join comparison below.
vec1 <- seq(1000)
vec2 <- seq(1000)
# Define functions to be benchmarked
expand_grid_func <- function() {
  # All combinations of the global vec1/vec2 via tidyr::expand_grid.
  # Explicit return() dropped — R returns the last expression, and
  # return() is conventionally reserved for early exits.
  expand_grid(vec1, vec2)
}
CJ_func <- function() {
  # Same cross join via data.table::CJ (sorted, keyed data.table).
  # Explicit return() dropped, matching expand_grid_func().
  CJ(vec1, vec2)
}
# Perform benchmarking
# expand_grid() vs CJ() for a 1000 x 1000 cross join.
m15 <- microbenchmark(
expand_grid = expand_grid_func(),
CJ = CJ_func(),
times = 10
)
print(m15)
## Unit: microseconds
## expr min lq mean median uq max neval
## expand_grid 2163.230 2190.411 2425.5183 2247.9130 2312.630 3536.465 10
## CJ 413.733 474.175 672.9749 491.7235 641.598 1841.559 10
fbox_plot(m15, "microseconds")
# Sample data
# 50 tables of 1e4 rows each for the row-binding comparison below.
size = 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = T),
value = rnorm(size)), simplify = F)
simple_bind <- function(list_of_dfs) {
  # Stack every table in the list with a single base-R rbind call
  # (one do.call, not pairwise binds, so the copy cost stays linear).
  do.call("rbind", list_of_dfs)
}
dplyr_bind <- function(list_of_dfs){
  # dplyr's row binder — the slowest of the three in the captured m16
  # run (it also matches columns by name, unlike plain rbind).
  bind_rows(list_of_dfs)
}
dt_bind <- function(list_of_dfs) {
  # data.table's optimized row binder; positions are matched as-is
  # since fill is off. `F` replaced with FALSE (F is reassignable).
  rbindlist(list_of_dfs, fill = FALSE)
}
# Benchmark both methods
# rbindlist() vs do.call(rbind) vs bind_rows() on 50 x 1e4-row tables.
m16 <- microbenchmark(
dt_ver = dt_bind(df_list),
simple = simple_bind(df_list),
dplyr_ver = dplyr_bind(df_list),
times = 30
)
print(m16)
## Unit: microseconds
## expr min lq mean median uq max neval
## dt_ver 428.601 472.473 561.4821 492.369 530.851 1978.275 30
## simple 461.763 511.315 590.4702 547.462 589.942 1872.607 30
## dplyr_ver 10150.869 10211.222 10382.2901 10305.828 10429.990 11518.713 30
fbox_plot(m16, "microseconds")
# One dotted 5-word string replicated to n rows for the word-splitting
# comparison below.
set.seed(123)
n <- 1e4
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = F)
df <- df[rep(1, n), , drop = F]
# Using tidyr::separate
separate_words <- function() {
  # Split the global df$text on "." into five columns, then keep only
  # w3 and w5 (plus the original text) by dropping the unwanted ones.
  wide <- df |>
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = F)
  select(wide, -c(w1, w2, w4))
}
# Using stringr::word
stringr_words <- function() {
  # Pull only the 3rd and 5th "."-separated words straight out of the
  # global df$text — no intermediate wide columns are created.
  third <- word(df$text, 3, sep = fixed("."))
  fifth <- word(df$text, 5, sep = fixed("."))
  mutate(df, w3 = third, w5 = fifth)
}
datatable_words <- function() {
  # Adds w3/w5 to the GLOBAL df *by reference* via data.table `:=`
  # (tstrsplit splits text on "." and [c(3, 5)] keeps pieces 3 and 5).
  # NOTE(review): because `:=` mutates df in place, every benchmark
  # iteration after the first re-assigns columns that already exist —
  # the timed work differs from the copy-based variants. Confirm this
  # is an acceptable comparison.
  df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}
# Benchmark: tidyr::separate vs stringr::word vs data.table tstrsplit.
m17 <- microbenchmark(
separate = separate_words(),
stringr = stringr_words(),
dt = datatable_words(),
times = 10
)
print(m17)
## Unit: milliseconds
## expr min lq mean median uq max neval
## separate 77.46553 79.14278 87.32748 87.05495 92.41392 104.79169 10
## stringr 168.04675 181.23135 190.41033 187.09969 205.81570 216.31872 10
## dt 12.42029 12.49104 12.95238 12.53949 13.01107 15.22234 10
fbox_plot(m17, "miliseconds")
# Sample data
# 1e6 rows with NAs in y and z for the NA-dropping comparison below.
set.seed(123)
n <- 1e6
df <- data.table(
x = rnorm(n),
y = sample(c(NA, 1:100), n, replace = TRUE),
z = sample(c(NA, letters), n, replace = TRUE),
stringsAsFactors = F
)
# Benchmark both methods
# tidyr::drop_na vs base na.omit on the 1e6-row df built above.
m18 <- microbenchmark(
  dplyr_drop_na = {
    df |> drop_na()
  },
  data_table_na_omit = {
    # BUG FIX: this previously ran `dt |> na.omit()`, timing the tiny
    # 4-row `dt` left over from the grepl example instead of `df` —
    # which is why the captured na.omit timings below are implausibly
    # small (~9 microseconds). Both arms now operate on the same data.
    df |> na.omit()
  },
  times = 10
)
print(m18)
## Captured output from the ORIGINAL (buggy) run — rerun after the fix:
## Unit: microseconds
## expr min lq mean median uq max
## dplyr_drop_na 9331.419 9343.972 9571.2625 9371.2625 9862.661 10352.135
## data_table_na_omit 9.017 9.217 44.8258 42.3895 58.099 146.984
## neval
## 10
## 10
fbox_plot(m18, "microseconds")
# Sample data
# 100 tables plus one lookup table for the sequential vs parallel
# keyed-merge comparison below.
set.seed(123)
size = 1e4
n_cores = parallelly::availableCores()
df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = T),
value = rnorm(size)), simplify = F)
extra_df <- data.table(id = sample(seq(size), size, replace = T),
extra_value = runif(size))
# Sequential join
sequential_join <- function() {
  # Merge each table in the global df_list against extra_df on id,
  # one at a time on a single core.
  join_one <- function(d) merge(d, extra_df, by = "id", allow.cartesian = TRUE)
  lapply(df_list, join_one)
}
# Parallel join using mclapply
parallel_join <- function() {
  # Same per-table merges as sequential_join(), fanned out across
  # n_cores forked workers.
  # NOTE(review): mclapply relies on fork(); confirm the target
  # platform is not Windows, where mc.cores > 1 is unsupported.
  join_one <- function(d) merge(d, extra_df, by = "id", allow.cartesian = TRUE)
  mclapply(df_list, join_one,
           mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}
# Benchmark both methods
# Sequential vs forked-parallel merging of the 100 tables.
m19 <- microbenchmark(
sequential = sequential_join(),
parallel = parallel_join(),
times = 10
)
print(m19)
## Unit: milliseconds
## expr min lq mean median uq max neval
## sequential 277.4242 300.0491 326.9741 326.1507 349.3709 406.3814 10
## parallel 123.0325 132.2365 144.4839 141.8593 157.2490 173.9063 10
fbox_plot(m19, "miliseconds")
# Another alternative: dtplyr translates dplyr verbs into data.table calls (install the dtplyr package first).
# 1e7-row grouped table for the dplyr vs dtplyr pipeline comparison.
set.seed(123)
n <- 1e7
df <- data.table(
group1 = sample(LETTERS[1:10], n, replace = TRUE),
group2 = sample(letters[1:5], n, replace = TRUE),
value1 = rnorm(n),
value2 = runif(n, 1, 100)
)
# Benchmark: the same filter/mutate/summarize pipeline run directly
# with dplyr vs translated to data.table via dtplyr::lazy_dt.
# NOTE(review): the result names `dplyr` and `dtplyr` shadow the
# package names — consider renaming before reuse.
m21 <- microbenchmark(
basic_way = {
dplyr <- df |>
filter(value1 > 0) |>
mutate(ratio = value1 / value2) |>
summarize(
mean_val1 = mean(value1),
sd_val1 = sd(value1),
median_val2 = median(value2),
max_ratio = max(ratio), .by = c("group1", "group2")) |>
as.data.table()
},
dtplyr_way = {
dtplyr = df |>
lazy_dt() |>
filter(value1 > 0) |>
mutate(ratio = value1 / value2) |>
summarize(
mean_val1 = mean(value1),
sd_val1 = sd(value1),
median_val2 = median(value2),
max_ratio = max(ratio), .by = c("group1", "group2")) |>
as.data.table()
},
times = 5
)
print(m21)
## Unit: milliseconds
## expr min lq mean median uq max neval
## basic_way 563.1599 565.5667 595.0042 592.8340 604.5774 648.8830 5
## dtplyr_way 450.4852 476.5814 480.8106 478.6591 481.8928 516.4344 5
fbox_plot(m21, "miliseconds")
with_parquet <- function(){
  # Read the in-progress "golden" dataset from its parquet directory
  # via arrow::open_dataset, select the session/memory/node columns,
  # derive the aggregate columns, then materialise with collect() and
  # convert to data.table. This is the arrow-side counterpart of the
  # SQL in with_duckfile() — keep the derived-column math in sync.
  fp_data <- "/conf/posit_azure_logs/data"
  data_1 <- open_dataset(file.path(glue::glue("{fp_data}/golden_data_in_progress"))) |>
    select(
      date, hours, time,
      ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
      ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
      ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
      ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
      ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
      ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
      ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
      ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
      ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
    ) |>
    mutate(
      # Per-pool node memory — presumably MiB per node (160 GiB / 256
      # GiB expressed in MiB); confirm the intended unit.
      computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
      bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
      ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
      ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
      ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
      total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
      total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
      total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
      total_node_mem = computepool_node_mem + bigpool_node_mem,
      # Guard the division against zero node counts.
      average_session_per_node = ifelse(ALL_WIP_node_total != 0,
        (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
    ) |>
    collect() |>
    as.data.table()
}
with_duckfile <- function(){
  # Copy the DuckDB database locally, run one SQL query that pushes all
  # derived-column work into DuckDB, and return the result as a
  # data.table. Fixes vs the original:
  #   * the DBI connection was opened inline and never disconnected,
  #     leaking one duckdb connection per call — now closed via on.exit;
  #   * the function returned the logical from file.remove() (its last
  #     expression) instead of the query result — now returns the data;
  #   * the redundant `data_2 <- res_duckdb_sql <-` double assignment
  #     is dropped.
  file.copy("/conf/posit_azure_logs/gatzos01/gd_inprogress.duckdb", "gd_inprogress.duckdb")
  con <- dbConnect(duckdb::duckdb(), dbdir = "./gd_inprogress.duckdb")
  # Always release the connection and remove the local copy, even if
  # the query errors.
  on.exit({
    dbDisconnect(con, shutdown = TRUE)
    file.remove("./gd_inprogress.duckdb")
  }, add = TRUE)
  dbGetQuery(
    conn = con,
    statement = glue("select date, hours, time,
    ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
    ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
    ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
    ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
    ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
    ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
    ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
    ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
    ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
    ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
    ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
    ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
    ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
    ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
    ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
    ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
    ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
    computepool_node_mem + bigpool_node_mem as total_node_mem,
    CASE
      WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
      ELSE 0
    END AS average_session_per_node
    from gdinprog"),
    immediate = TRUE) |>
    as.data.table()
}
# Benchmark: reading + deriving columns from parquet via arrow vs a
# single SQL query against a copied DuckDB file (3 reps each).
m22 <- microbenchmark(
with_parquet = with_parquet(),
with_duckfile = with_duckfile(),
times = 3
)
print(m22)
## Unit: milliseconds
## expr min lq mean median uq max
## with_parquet 25053.1324 25085.4129 28886.5502 25117.6934 30803.2591 36488.825
## with_duckfile 508.7684 509.5344 711.8886 510.3004 813.4488 1116.597
## neval
## 3
## 3
fbox_plot(m22, "miliseconds")